# Load the raw database, dropping columns 2-9 and 11-13 by position.
# NOTE(review): positional drops break silently if the CSV column order
# changes -- confirm which fields these indices refer to.
data <- read.csv("../data/filledDatabase.csv")[, -c(2:9, 11:13)]
# Clean and collapse the data (clean_data / collapse_data are project helpers
# defined elsewhere).
data <- clean_data(data) %>% collapse_data()
# Keep the compound name, class label and space group aside before the
# identifier columns are removed from `data`.
compound <- data$Compound
group_cat <- data$GroupCat
space_group <- data$SpaceGroup
# Drop the Compound, X and Z columns; GroupCat and SpaceGroup remain in `data`
# and are used by the model-fitting code.
data <- select(data, -c("Compound", "X", "Z"))
# The PCA projection (first 13 PCs) is currently disabled:
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()
# Split the rows into 5 held-out folds for cross-validation later. Folding on
# the outcome (rather than on row indices) lets createFolds() stratify by
# class, so each fold keeps roughly the overall GroupCat proportions.
# NOTE(review): no set.seed() is visible here, so fold membership may differ
# between runs unless a seed is set elsewhere.
folds <- caret::createFolds(
  data$GroupCat,
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
Multinomial Regression
library(glmnet)
# Predictor matrix for glmnet: drop the first column of `data`, add dummy
# columns for SpaceGroup via dummy_cols(), then remove SpaceGroupNumber and the
# original SpaceGroup column before converting to a matrix.
# NOTE(review): data[, -1] assumes the first column is not a predictor
# (presumably GroupCat) -- confirm against the output of collapse_data().
X <- data[, -1] %>%
  dummy_cols(select_columns = "SpaceGroup") %>%
  select(-c(SpaceGroupNumber, SpaceGroup)) %>%
  as.matrix()
# Response: GroupCat as a one-column matrix.
Y <- as.matrix(data$GroupCat)
Coefficient
Ridge
# Ridge-penalised multinomial fit (alpha = 0); lambda is selected by 5-fold
# cross-validation using deviance as the loss.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Extract the coefficients at lambda.min with the project helper get_coef(),
# keep the feature column plus the three class columns, and plot them.
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

LASSO
# LASSO-penalised multinomial fit (alpha = 1); lambda is selected by 5-fold
# cross-validation using deviance as the loss.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Extract the coefficients at lambda.min with the project helper get_coef(),
# keep the feature column plus the three class columns, and plot them.
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Elastic Net
library(caret)
# Elastic-net fit tuned through caret: glmnet model, 5-fold cross-validation,
# tuning grid granularity of 10.
# NOTE(review): the formula interface fits on `data` directly, whereas the
# ridge and LASSO fits use the matrix X (first column dropped, SpaceGroup
# dummy-coded, SpaceGroupNumber removed) -- confirm the predictor sets are
# meant to differ.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 5)
)
# Coefficients of the final model at the selected lambda, restricted to the
# feature column plus the three class columns, then plotted.
get_coef(
  elastic_cv$finalModel,
  tuning_parameter = elastic_cv$bestTune$lambda
) %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Classification accuracy
Ridge
# Build the prediction summary for ridge (alpha = 0) at the CV-selected lambda.
# prediction_table() is a project helper defined elsewhere; its result exposes
# `$r` (per-fold rates) and `$t` (class cross-tabulation), both used below.
tb_ridge <- prediction_table(
  alpha = 0,
  lambda = ridge_cv$lambda.min
)
# Show the per-fold rates and their mean.
tb_ridge$r %>%
  print_accurate_tb()
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9459459 | 0.9594595 | 0.96 | 0.9726027 | 0.9466667 | 0.956935 |
# Display the ridge class cross-tabulation (tb_ridge$t) as counts using the
# project helper highlight_tb_count().
# NOTE(review): which axis is predicted vs. actual is decided inside
# prediction_table() -- confirm there.
tb_ridge$t %>% highlight_tb_count()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 178 | 2 | 1 |
| Others | 0 | 28 | 5 |
| Tilted | 1 | 7 | 149 |
| Total | 179 | 37 | 155 |
# Display the same ridge cross-tabulation as proportions using the project
# helper highlight_tb_percent().
# NOTE(review): the rendered Total row reads 100% per column, which suggests
# column-wise normalisation -- confirm in highlight_tb_percent().
tb_ridge$t %>% highlight_tb_percent()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.99 | 0.05 | 0.01 |
| Others | 0 | 0.76 | 0.03 |
| Tilted | 0.01 | 0.19 | 0.96 |
| Total | 100% | 100% | 100% |
# List every cell of the ridge cross-tabulation in long form (Var1, Var2,
# Freq), most frequent combination first.
as.data.frame(tb_ridge$t) %>%
  arrange(desc(Freq))
## Var1 Var2 Freq
## 1 Cubic Cubic 178
## 2 Tilted Tilted 149
## 3 Others Others 28
## 4 Tilted Others 7
## 5 Others Tilted 5
## 6 Cubic Others 2
## 7 Tilted Cubic 1
## 8 Cubic Tilted 1
## 9 Others Cubic 0
LASSO
# Build the prediction summary for LASSO (alpha = 1) at the CV-selected lambda.
# prediction_table() is a project helper defined elsewhere; its result exposes
# `$r` (per-fold rates) and `$t` (class cross-tabulation), both used below.
tb_lasso <- prediction_table(
  alpha = 1,
  lambda = lasso_cv$lambda.min
)
# Show the per-fold rates and their mean.
tb_lasso$r %>%
  print_accurate_tb()
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9864865 | 0.9459459 | 0.96 | 0.9726027 | 0.9333333 | 0.9596737 |
# Display the LASSO class cross-tabulation (tb_lasso$t) as counts using the
# project helper highlight_tb_count().
# NOTE(review): which axis is predicted vs. actual is decided inside
# prediction_table() -- confirm there.
tb_lasso$t %>% highlight_tb_count()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 178 | 1 | 0 |
| Others | 0 | 29 | 6 |
| Tilted | 1 | 7 | 149 |
| Total | 179 | 37 | 155 |
# Display the same LASSO cross-tabulation as proportions using the project
# helper highlight_tb_percent().
# NOTE(review): the rendered Total row reads 100% per column, which suggests
# column-wise normalisation -- confirm in highlight_tb_percent().
tb_lasso$t %>% highlight_tb_percent()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.99 | 0.03 | 0 |
| Others | 0 | 0.78 | 0.04 |
| Tilted | 0.01 | 0.19 | 0.96 |
| Total | 100% | 100% | 100% |
Elastic Net
# Build the prediction summary for the elastic net at the tuning values caret
# selected. bestTune columns are read by name rather than by position, so the
# call stays correct regardless of column order and matches the
# `elastic_cv$bestTune$lambda` access used when plotting the coefficients.
# prediction_table() is a project helper defined elsewhere; its result exposes
# `$r` (per-fold rates) and `$t` (class cross-tabulation), both used below.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Show the per-fold rates and their mean.
tb_elastic$r %>% print_accurate_tb()
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9864865 | 0.972973 | 0.96 | 0.9589041 | 0.9466667 | 0.965006 |
# Display the elastic-net class cross-tabulation (tb_elastic$t) as counts using
# the project helper highlight_tb_count().
# NOTE(review): which axis is predicted vs. actual is decided inside
# prediction_table() -- confirm there.
tb_elastic$t %>% highlight_tb_count()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 178 | 1 | 0 |
| Others | 0 | 29 | 4 |
| Tilted | 1 | 7 | 151 |
| Total | 179 | 37 | 155 |
# Display the same elastic-net cross-tabulation as proportions using the
# project helper highlight_tb_percent().
# NOTE(review): the rendered Total row reads 100% per column, which suggests
# column-wise normalisation -- confirm in highlight_tb_percent().
tb_elastic$t %>% highlight_tb_percent()
| | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.99 | 0.03 | 0 |
| Others | 0 | 0.78 | 0.03 |
| Tilted | 0.01 | 0.19 | 0.97 |
| Total | 100% | 100% | 100% |